Virtual useR! 2020





Parfait Gasana

Data Analyst, Winston & Strawn LLP
Chicago, IL
@Parfait (StackOverflow) |   @ParfaitG (GitHub)


Data Science Challenges




Relational Database Solution






Use Case: Environment Database

Human Impact on the Global and Local Biosphere and Climate



Connection

ODBC

library(DBI)
library(odbc)

conn <- dbConnect(odbc::odbc(), driver="PostgreSQL Unicode", 
                  server="10.0.0.220", database="environment",
                  uid="postgres", pwd="env19", port=6432)
dbListTables(conn)
##  [1] "ag_census"             "arable_land"          
##  [3] "consumption"           "fws_species_count"    
##  [5] "fws_species_year"      "global_mean_sea_level"
##  [7] "global_temperature"    "groundwater"          
##  [9] "iucn_species_count"    "ocean_data"           
## [11] "ocean_data_vw"         "plants_assessments"   
## [13] "ppm_month"             "ppm_week"             
## [15] "sea_ice_extent"        "us_co2_emissions"     
## [17] "us_gdp"                "us_renewable_energy"  
## [19] "usa_population"        "world_co2_emissions"  
## [21] "world_gdp"             "world_population"
dbDisconnect(conn)

JDBC

library(rJava)
library(RJDBC)

drv <- JDBC("org.postgresql.Driver",
           "/usr/lib/jvm/java-8-oracle/lib/postgresql-42.2.2.jar")
conn <- dbConnect(drv, "jdbc:postgresql://10.0.0.220:6432/environment", "postgres", "env19")
dbListTables(conn, schema="public")
##  [1] "ag_census_pkey"               "arable_land_pkey"            
##  [3] "co2_emissions_pkey"           "consumption_pkey"            
##  [5] "fws_species_count_pkey"       "fws_species_year_pkey"       
##  [7] "global_mean_sea_level_pkey"   "global_temperature_pkey"     
##  [9] "groundwater_pkey"             "iucn_species_count_pkey"     
## [11] "ocean_data_pkey"              "plants_assessments_pkey"     
## [13] "ppm_month_pkey"               "ppm_week_pkey"               
## [15] "sea_ice_extent_pkey"          "us_gdp_pkey"                 
## [17] "us_renewable_energy_pkey"     "usa_population_pkey"         
## [19] "world_co2_emissions_pkey"     "world_gdp_pkey"              
## [21] "world_population_pkey"        "ag_census_id_seq"            
## [23] "arable_land_id_seq"           "co2_emissions_id_seq"        
## [25] "consumption_id_seq"           "fws_species_count_id_seq"    
## [27] "fws_species_year_id_seq"      "global_mean_sea_level_id_seq"
## [29] "global_temperature_id_seq"    "groundwater_id_seq"          
## [31] "iucn_species_count_id_seq"    "ocean_data_id_seq"           
## [33] "plants_assessments_id_seq"    "ppm_month_id_seq"            
## [35] "ppm_week_id_seq"              "sea_ice_extent_id_seq"       
## [37] "us_gdp_id_seq"                "us_renewable_energy_id_seq"  
## [39] "usa_population_id_seq"        "world_co2_emissions_id_seq"  
## [41] "world_gdp_id_seq"             "world_population_id_seq"     
## [43] "ag_census"                    "arable_land"                 
## [45] "consumption"                  "fws_species_count"           
## [47] "fws_species_year"             "global_mean_sea_level"       
## [49] "global_temperature"           "groundwater"                 
## [51] "iucn_species_count"           "ocean_data"                  
## [53] "plants_assessments"           "ppm_month"                   
## [55] "ppm_week"                     "sea_ice_extent"              
## [57] "us_co2_emissions"             "us_gdp"                      
## [59] "us_renewable_energy"          "usa_population"              
## [61] "world_co2_emissions"          "world_gdp"                   
## [63] "world_population"             "ocean_data_vw"
dbDisconnect(conn)
## [1] TRUE

RPostgreSQL API

library(RPostgreSQL)

conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="environment",
                  user="postgres", password="env19", port=6432)
dbListTables(conn)
##  [1] "ag_census"             "arable_land"          
##  [3] "consumption"           "fws_species_count"    
##  [5] "fws_species_year"      "global_mean_sea_level"
##  [7] "global_temperature"    "groundwater"          
##  [9] "iucn_species_count"    "ocean_data"           
## [11] "plants_assessments"    "ppm_month"            
## [13] "ppm_week"              "sea_ice_extent"       
## [15] "us_gdp"                "us_renewable_energy"  
## [17] "usa_population"        "world_co2_emissions"  
## [19] "world_gdp"             "world_population"     
## [21] "us_co2_emissions"

Data Migration

Schema

CREATE TABLE global_temperature (
   id SERIAL NOT NULL PRIMARY KEY,
   year INT,
   period VARCHAR(50),
   global_mean NUMERIC(5,2)
);

COPY global_temperature (year, period, global_mean) FROM '/path/to/csv/global_temperature.csv' DELIMITER ',' CSV HEADER;

Documentation

COMMENT ON TABLE global_temperature IS 'Source: NASA - Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, LOTI) (https://data.giss.nasa.gov/gistemp/)';
SELECT relname as table, obj_description(oid) as comment
FROM pg_class
WHERE relkind = 'r'
  AND obj_description(oid) IS NOT NULL
  ORDER BY relname
table comment
ag_census U.S. Department of Agriculture, Census Data Query Tool (CDQT): https://www.nass.usda.gov/Quick_Stats/CDQT/chapter/1/table/1
arable_land Source: World Bank - Arable land (% of land area) (https://data.worldbank.org/indicator/AG.LND.ARBL.ZS?end=2016&start=1961&view=chart)
consumption Source: EIA - Monthly Energy Review, publication of recent and historical energy statistics (https://www.eia.gov/totalenergy/data/monthly/)
fws_species_count Source: US FWS - U.S. Federal Endangered and Threatened Species (https://ecos.fws.gov/ecp/species-reports)
fws_species_year Source: US FWS - U.S. Federal Endangered and Threatened Species (https://ecos.fws.gov/ecp/species-reports)
global_mean_sea_level Source: NASA - Global Mean Sea Level (mm): https://climate.nasa.gov/vital-signs/sea-level/
global_temperature Source: NASA - Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, LOTI) (https://data.giss.nasa.gov/gistemp/)
groundwater U.S. Geological Survey (USGS) Groundwater-Level Annual Statistics for the Nation: https://waterdata.usgs.gov/nwis/annual?referred_module=gw&search_criteria=site_tp_cd&submitted_form=introduction
iucn_species_count Source: IUCN - Red List of Threatened Species (https://www.iucnredlist.org/resources/summary-statistics#Summary%20Tables)
ocean_data Source: NOAA - Global Ocean Data Analysis Project (https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.nodc:0162565#)
plants_assessments Source: Botanic Gardens Conservation International - Plant Threat Search: https://tools.bgci.org/threat_search.php
ppm_month Source: NOAA - Mauna Loa CO2 records Carbon PPM (https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html)
ppm_week Source: NOAA - Mauna Loa CO2 records Carbon PPM (https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html)
sea_ice_extent Source: NSIDC - Sea Ice Index Data (https://nsidc.org/data/seaice_index/archives)
us_gdp Source: Federal Reserve Bank of St. Louis - US Real Gross Domestic Product by Industry: https://fred.stlouisfed.org/categories/33045
us_renewable_energy Source: EIA - Renewable Energy Production and Consumption by Source: https://www.eia.gov/totalenergy/data/monthly/
usa_population Source: Federal Reserve Bank of St. Louis - U.S. Monthly Population (https://fred.stlouisfed.org/series/POPTHM)
world_co2_emissions Source: World Bank - World CO2 emissions (kt): https://data.worldbank.org/indicator/EN.ATM.CO2E.KT?end=2014&start=1960&view=chart
world_gdp Source: World Bank - World GDP (Current US$): https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
world_population Source: World Bank - Arable land (% of land area) (https://data.worldbank.org/indicator/AG.LND.ARBL.ZS?end=2016&start=1961&view=chart)

Data Analysis



Single Aggregation

SELECT p.date_year,
       sum(p.average_ppm) as carbon_ppm_total, 
       avg(p.average_ppm) as carbon_ppm_mean, 
       sum(c.energy_consumed) as btu_consumed_total,
       avg(c.energy_consumed) as btu_consumed_mean,
       sum(e.energy_co2) as co2_emissions_total,
       avg(e.energy_co2) as co2_emissions_mean
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
GROUP BY p.date_year
ORDER BY p.date_year
date_year carbon_ppm_total carbon_ppm_mean btu_consumed_total btu_consumed_mean co2_emissions_total co2_emissions_mean
38 2010 4678.79 389.8992 6641.355 553.4462 5585.741 465.4784
39 2011 4699.83 391.6525 6473.666 539.4722 5446.133 453.8444
40 2012 4726.24 393.8533 5684.503 473.7086 5237.300 436.4417
41 2013 4758.25 396.5208 6689.368 557.4473 5363.018 446.9182
42 2014 4783.77 398.6475 7007.139 583.9283 5411.193 450.9327
43 2015 4810.01 400.8342 6465.092 538.7577 5264.776 438.7313
44 2016 4850.87 404.2392 6033.098 502.7582 5172.402 431.0335
45 2017 4878.64 406.5533 6111.580 509.2983 5130.589 427.5491
46 2018 4902.26 408.5217 6896.516 574.7097 5267.750 438.9792
47 2019 410.83 410.8300 1163.200 1163.2000 493.839 493.8390

Multiple Join

SELECT p.date_year, 
       p.date_month, 
       CONCAT(p.date_year, '-', p.date_month, '-01')::date as date_day,
       p.average_ppm as "carbon ppm", 
       c.energy_consumed as "btu consumed", 
       e.energy_co2 as "co2 emissions"
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
  ORDER BY p.date_year, p.date_month

Time Series Plot of Carbon PPM, US Energy Consumption and US CO2 Emissions

metric_ts <- xts(x=metrics_df[c("carbon ppm", "btu consumed", "co2 emissions")], 
                 order.by=metrics_df$date_day)

plot(metric_ts, main = "Energy and Carbon PPM Metrics",
     legend.loc="bottomright", 
     col = c("red", "blue", "darkgreen"),
     yaxis.right=FALSE,
     axes=FALSE,
     lwd=1,
     cex.main=3,
     major.ticks="years",
     major.format="%Y",
     minor.format="%Y",
     grid.ticks.lty=1)

Renewable Energy: Production and Consumption


SELECT energy_type,
       date,
       SUM(production) AS production,
       SUM(consumption) AS consumption
FROM us_renewable_energy
GROUP BY energy_type,
         date
ORDER BY energy_type,
         date
energy_type date production consumption
4678 Wood Energy 2019-02-01 190.887 182.491
4679 Wood Energy 2019-03-01 198.621 191.507
4680 Wood Energy 2019-04-01 195.791 187.670
4681 Wood Energy 2019-05-01 201.743 193.775
4682 Wood Energy 2019-06-01 198.379 189.036
4683 Wood Energy 2019-07-01 205.023 196.873

Multiple Time Series Plots by Energy Type

par(mfrow=c(3,3), mar=c(5, 5, 2, 1))
output <- by(renewable_df, renewable_df$energy_type, function(sub) {
  
  metric_ts <- xts(x=sub[c("production", "consumption")], 
                   order.by=sub$date)
  
  print(plot(metric_ts, main = sub$energy_type[1],
             legend.loc="bottomright", 
             col = seaborn_palette[c(1,4)],
             yaxis.right=FALSE,
             axes=FALSE,
             lwd=1,
             cex.main=3,
             major.ticks="years",
             major.format="%Y",
             minor.format="%Y",
             grid.ticks.lty=1))
  
})

Dual Aggregation

WITH c1 AS
  (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
           REPLACE(description, 'Primary Energy Consumed by the ', '') AS "Sector",
           SUM(energy_consumed) AS "Primary Energy Consumed"
   FROM consumption
   WHERE msn IN ('TXICBUS', 'TXRCBUS', 'TXACBUS', 'TXCCBUS', 'TXEIBUS')
   GROUP BY CONCAT((date_year/10)::int * 10, 's'), 
            REPLACE(description, 'Primary Energy Consumed by the ', '')
  ),
   c2 AS
  (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
           REPLACE(REPLACE(description, 'Total Energy ', ''), ' CO2 Emissions', '') AS "Sector",
           SUM(energy_co2) AS "Total CO2 Emissions"
   FROM us_co2_emissions
   WHERE msn IN ('TERCEUS', 'TECCEUS', 'TEACEUS', 'TXEIEUS')
   GROUP BY CONCAT((date_year/10)::int * 10, 's'), 
             REPLACE(REPLACE(description, 'Total Energy ', ''), ' CO2 Emissions', '')
  )

SELECT c1.decade, c1."Sector", c1."Primary Energy Consumed", c2."Total CO2 Emissions"
FROM c1
LEFT JOIN c2
    ON c1."Sector" = c2."Sector" AND c1.decade = c2.decade
ORDER BY c1.decade, c1."Sector"
decade Sector Primary Energy Consumed Total CO2 Emissions
35 2000s Transportation Sector 546616.33 38377.90
36 2010s Commercial Sector 77171.71 17076.89
37 2010s Electric Power Sector 693591.66 35901.58
38 2010s Industrial Sector 388661.21 NA
39 2010s Residential Sector 117157.95 19437.04
40 2010s Transportation Sector 490149.36 33393.26

U.S. Primary Energy Consumption by Sector



plot_mat <- with(consumed_df, tapply(`Primary Energy Consumed`, list(decade, `Sector`), sum))
par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Primary Energy Consumption by Sector", cex.main=1.5,
        col=seaborn_palette[1:8], ylim=c(0, 9E5), xaxt="n", yaxt="n", beside=TRUE)

axis(side=1, at=c(5, 14, 23, 32, 41), labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:8], ncol=8)

U.S. CO2 Emissions by Sector

plot_mat <- with(consumed_df, tapply(`Total CO2 Emissions`, list(decade, `Sector`), sum))
plot_mat <- plot_mat[, c("Commercial Sector", "Electric Power Sector", 
                         "Residential Sector", "Transportation Sector"), drop=FALSE]
plot_mat <- plot_mat[complete.cases(plot_mat),,drop=FALSE]

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. CO2 Emissions by Sector", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 6E4), xaxt="n", yaxt="n", beside=TRUE)

axis(side=1, at=c(3, 9, 15, 21)+0.5, labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

Parameterized Query

sql <- "WITH sub AS
          (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                  REPLACE($1, '%', '') as sector,
                  REPLACE(REPLACE(description, 
                                  CONCAT(REPLACE($2, '%', ''), ' CO2 Emissions'), 
                                  ''),
                          ' ', '\n')  AS \"Type\",
                 energy_co2
           FROM us_co2_emissions
           WHERE description LIKE $3)

       SELECT decade, sector, \"Type\", 
              SUM(energy_co2) AS \"Total CO2 Emissions\"
       FROM sub
       GROUP BY decade, sector, \"Type\"
       ORDER BY decade, sector, \"Type\"
       "

params <- paste(c("%Transportation", "%Residential", "%Commercial"), "Sector%")
emissions_type_df <- do.call(rbind, lapply(params, function(p) 
                                            dbGetQuery(conn, sql, param=list(p, p, p))))

U.S. CO2 Emissions Bar Plot by Fuel Type across Sectors

output <- by(emissions_type_df, emissions_type_df$sector, function(sub) {
  plot_mat <- with(sub, tapply(`Total CO2 Emissions`, list(decade, `Type`), sum))

  barplot(plot_mat, main=paste("U.S. CO2 Emissions by", sub$sector[[1]]), cex.main=1.5,
          col=seaborn_palette[1:5], ylim=c(0, ceiling(max(plot_mat, na.rm=TRUE)/1E4) * 1E4), 
          xaxt="n", yaxt="n", beside=TRUE)
  
  axis(side=1, at=c(3,9,15,21,27,33,39,45,51,57,63)[seq_along(colnames(plot_mat))] + 0.5,
       labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
  legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
})

Global Metrics


Multiple Aggregations

WITH pop AS 
  (SELECT p.year,
          p.population
   FROM world_population p
   WHERE p.country_name = 'World'
     AND p.year BETWEEN 2000 AND 2019
   ),
    
    land AS
  (SELECT a.year,
          a.percent_arable
   FROM arable_land a
   WHERE a.country_name = 'World'
     AND a.year BETWEEN 2000 AND 2019
   ),

    fauna AS
  (SELECT i.year,
          SUM(i.species_count) AS animals_count
   FROM iucn_species_count i
   WHERE i.year BETWEEN 2000 AND 2019
   GROUP BY i.year
   ),

    flora AS
  (SELECT p.assessment_year as year,
          COUNT(*) AS plants_count
   FROM plants_assessments p
   WHERE p.assessment_year BETWEEN 2000 AND 2019
     AND p.interpreted_status = 'Threatened'
   GROUP BY p.assessment_year
   ),             

    ice AS
  (SELECT s.date_year as year,
          AVG(s.extent) FILTER(WHERE s.region = 'Arctic') AS arctic_sea_ice_extent,
          AVG(s.extent) FILTER(WHERE s.region = 'Antarctica') AS antarctic_sea_ice_extent
   FROM sea_ice_extent s
   WHERE s.date_year BETWEEN 2000 AND 2019
   GROUP BY s.date_year
   ),
   
    ocean AS
  (SELECT o.year as year,
          AVG(o.tco2) AS total_co2,
          AVG(o.phts25p0) AS ph_scale
   FROM ocean_data o
   WHERE o.year BETWEEN 2000 AND 2019
     AND o.tco2 <> -9999 AND o.phts25p0 <> -9999
   GROUP BY o.year
   ),
   
    temp AS
  (SELECT g.year as year,
          AVG(g.global_mean) AS global_mean
   FROM global_temperature g
   WHERE g.year BETWEEN 2000 AND 2019
   GROUP BY g.year
   )
   
SELECT pop.year, pop.population, land.percent_arable, 
       fauna.animals_count, flora.plants_count,
       ice.arctic_sea_ice_extent, ice.antarctic_sea_ice_extent, 
       ocean.total_co2, ocean.ph_scale, temp.global_mean
FROM pop 
JOIN land USING (year)
JOIN fauna USING (year)
JOIN flora USING (year)
JOIN ice USING (year)
JOIN ocean USING (year)
JOIN temp USING (year)
ORDER BY pop.year
year population percent_arable animals_count plants_count arctic_sea_ice_extent antarctic_sea_ice_extent total_co2 ph_scale global_mean
7 2008 6765986891 10.80640 16928 3279 10.97785 12.23941 2181.326 7.685694 0.5155556
8 2009 6849272706 10.77478 17291 2294 10.93197 12.04859 2176.104 7.735574 0.6294444
9 2010 6932596129 10.74432 18351 3217 10.71139 12.10679 2191.072 7.702093 0.7022222
10 2011 7014792135 10.80189 19570 2850 10.48350 11.50057 2224.690 7.675097 0.5844444
11 2012 7099311892 10.87358 20219 7556 10.40610 12.00444 2187.112 7.731700 0.6183333
12 2013 7184861447 10.89469 21353 5465 10.89712 12.52361 2183.400 7.723483 0.6438889

Scatter Plots of World Population and Other Metrics with LOESS Fit

par(mfrow=c(4,2), mai = c(0.3, 0.2, 0.7, 0.2))

for(x in colnames(env_world_df)[3:ncol(env_world_df)]) {
  lfit <- loess(paste(x, "~ population"), data=env_world_df)
  
  plot(as.formula(paste(x, "~ population")), env_world_df, 
       main=paste("population and\n", gsub("_", " ", x)), cex.main=2,
       type="p", col=seaborn_palette[1], yaxt='n', xaxt='n', pch=19)
  axis(side=1, at=axTicks(1), labels=format(axTicks(1), big.mark=',', scientific=FALSE))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE))
  pop_order <- order(env_world_df$population)
  lines(env_world_df$population[pop_order], lfit$fitted[pop_order], col=seaborn_palette[4], lwd=3)
}

Shiny Application

# LINUX SHELL COMMAND CALL
system(paste0("gnome-terminal -- Rscript -e \"library(shiny); setwd('", 
              getwd(), "'); 
              runApp('Postgres_As_Data_Science_DB_Shiny_App.R')\""))


PostgreSQL as tool in data science pipeline




  • Robust big data solution

  • Facilitates access and data management

  • Leverages query engine

  • Supports reproducible processs

  • Streamlined open source ecosystem